from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
np.random.seed(42)
pd.options.display.max_columns = 999
%matplotlib inline
Nov 16, 2022
We'll load data compiled from two data sources:
# Load the data
data = pd.read_csv("./data/gdp_vs_satisfaction.csv")
data.head()
| Country | life_satisfaction | gdp_per_capita | |
|---|---|---|---|
| 0 | Australia | 7.3 | 50961.87 |
| 1 | Austria | 7.1 | 43724.03 |
| 2 | Belgium | 6.9 | 40106.63 |
| 3 | Brazil | 6.4 | 8670.00 |
| 4 | Canada | 7.4 | 43331.96 |
# Linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
# Model selection
from sklearn.model_selection import train_test_split
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
First step: set up the test/train split of input data:
# Do a 70/30 train/test split
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
# Features
X_train = train_set['gdp_per_capita'].values
X_train = X_train[:, np.newaxis]
X_test = test_set['gdp_per_capita'].values
X_test = X_test[:, np.newaxis]
# Labels
y_train = train_set['life_satisfaction'].values
y_test = test_set['life_satisfaction'].values
As we increase the polynomial degree (add more and more polynomial features) two things happen:
This is the classic case of overfitting — our model does not generalize well at all.
Ridge adds regularization to the linear regression least squares modelRemember, regularization penalizes large parameter values and complex fits
Let's gain some intuition:
Important
LinearModel and scales input features with StandardScalerStandardScaler and PolynomialFeatures(degree=3) pre-processing to featuresSet up a grid of GDP per capita points to make predictions for:
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)
# Sklearn needs the second axis!
X_pred = gdp_pred[:, np.newaxis]
# Create a pre-processing pipeline
# This scales and adds polynomial features up to degree = 3
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))
# BASELINE: Setup and fit a linear model (with scaled features)
linear = LinearRegression()
scaler = StandardScaler()
linear.fit(scaler.fit_transform(X_train), y_train)
with plt.style.context("fivethirtyeight"):
fig, ax = plt.subplots(figsize=(10, 6))
## Plot the data
ax.scatter(
data["gdp_per_capita"] / 1e5,
data["life_satisfaction"],
label="Data",
s=100,
zorder=10,
color="#666666",
)
## Evaluate the linear fit
print("Linear fit")
training_score = linear.score(scaler.fit_transform(X_train), y_train)
print(f"Training Score = {training_score}")
test_score = linear.score(scaler.fit_transform(X_test), y_test)
print(f"Test Score = {test_score}")
print()
## Plot the linear fit
ax.plot(
X_pred / 1e5,
linear.predict(scaler.fit_transform(X_pred)),
color="k",
label="Linear fit",
)
## Ridge regression: linear model with regularization
# Plot the predicted values for each alpha
for alpha in [0, 10, 100, 1e5]:
print(f"alpha = {alpha}")
# Create out Ridge model with this alpha
ridge = Ridge(alpha=alpha)
# Fit the model on the training set
# NOTE: Use the pipeline that includes polynomial features
ridge.fit(pipe.fit_transform(X_train), y_train)
# Evaluate on the training set
training_score = ridge.score(pipe.fit_transform(X_train), y_train)
print(f"Training Score = {training_score}")
# Evaluate on the test set
test_score = ridge.score(pipe.fit_transform(X_test), y_test)
print(f"Test Score = {test_score}")
# Plot the ridge results
y_pred = ridge.predict(pipe.fit_transform(X_pred))
ax.plot(X_pred / 1e5, y_pred, label=f"alpha = {alpha}")
print()
# Plot formatting
ax.legend(ncol=2, loc=0)
ax.set_ylim(4, 8)
ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
ax.set_ylabel("Life Satisfaction")
Linear fit Training Score = 0.4638100579740343 Test Score = 0.35959585147159556 alpha = 0 Training Score = 0.6458898101593082 Test Score = 0.5597457659851048 alpha = 10 Training Score = 0.5120282691427858 Test Score = 0.38335642103788325 alpha = 100 Training Score = 0.1815398751108913 Test Score = -0.05242399995626967 alpha = 100000.0 Training Score = 0.0020235571180508005 Test Score = -0.26129559971586125
LinearRegression() with the polynomial featuresMore feature engineering!
In this case, I've done the hard work for you and pulled additional country properties from the OECD's website.
data2 = pd.read_csv("./data/gdp_vs_satisfaction_more_features.csv")
data2.head()
| Country | life_satisfaction | GDP per capita | Air pollution | Employment rate | Feeling safe walking alone at night | Homicide rate | Life expectancy | Quality of support network | Voter turnout | Water quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Australia | 7.3 | 50961.87 | 5.0 | 73.0 | 63.5 | 1.1 | 82.5 | 95.0 | 91.0 | 93.0 |
| 1 | Austria | 7.1 | 43724.03 | 16.0 | 72.0 | 80.6 | 0.5 | 81.7 | 92.0 | 80.0 | 92.0 |
| 2 | Belgium | 6.9 | 40106.63 | 15.0 | 63.0 | 70.1 | 1.0 | 81.5 | 91.0 | 89.0 | 84.0 |
| 3 | Brazil | 6.4 | 8670.00 | 10.0 | 61.0 | 35.6 | 26.7 | 74.8 | 90.0 | 79.0 | 73.0 |
| 4 | Canada | 7.4 | 43331.96 | 7.0 | 73.0 | 82.2 | 1.3 | 81.9 | 93.0 | 68.0 | 91.0 |
We'll move beyond simple linear regression and see if we can get a better fit.
A decision tree learns decision rules from the input features:
For a specific corner of the input feature space, the decision tree predicts an output target value
Decision trees can be very deep with many nodes — this will lead to overfitting your dataset!
This is an example of ensemble learning: combining multiple estimators to achieve better overall accuracy than any one model could achieve
from sklearn.ensemble import RandomForestRegressor
Let's split our data into training and test sets again:
# Split the data 70/30
train_set, test_set = train_test_split(data2, test_size=0.3, random_state=42)
# the target labels
y_train = train_set["life_satisfaction"].values
y_test = test_set["life_satisfaction"].values
# The features
feature_cols = [col for col in data2.columns if col not in ["life_satisfaction", "Country"]]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
import seaborn as sns
sns.heatmap(
train_set[feature_cols].corr(),
cmap="coolwarm",
annot=True,
vmin=-1,
vmax=1
);
New: Pipelines support models as the last step!
Pipeline behaves just like a model, but it runs the transformations beforehand.fit() function of the pipeline instead of the modelEstablish a baseline with a linear model:
# Linear model pipeline with two steps
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())
# Fit the pipeline
# NEW: This applies all of the transformations, and then fits the model
print("Linear regression")
linear_pipe.fit(X_train, y_train)
# NEW: Print the training score
training_score = linear_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")
# NEW: Print the test score
test_score = linear_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Linear regression Training Score = 0.755333265746168 Test Score = 0.6478865590446832
Now fit a random forest:
# Random forest model pipeline with two steps
forest_pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, max_depth=2, random_state=42)
)
# Fit a random forest
print("Random forest")
forest_pipe.fit(X_train, y_train)
# Print the training score
training_score = forest_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")
# Print the test score
test_score = forest_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Random forest Training Score = 0.8460559576380231 Test Score = 0.862756366860489
Because random forests are an ensemble method with multiple estimators, the algorithm can learn which features help improve the fit the most.
feature_importances_ attributefit()!# What are the named steps?
forest_pipe.named_steps
{'standardscaler': StandardScaler(),
'randomforestregressor': RandomForestRegressor(max_depth=2, random_state=42)}
# Get the forest model
forest_model = forest_pipe['randomforestregressor']
forest_model.feature_importances_
array([0.67746013, 0.0038663 , 0.13108609, 0.06579352, 0.00985913,
0.01767323, 0.02635605, 0.00601776, 0.06188779])
# Make the dataframe
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")
importance
| Feature | Importance | |
|---|---|---|
| 1 | Air pollution | 0.003866 |
| 7 | Voter turnout | 0.006018 |
| 4 | Homicide rate | 0.009859 |
| 5 | Life expectancy | 0.017673 |
| 6 | Quality of support network | 0.026356 |
| 8 | Water quality | 0.061888 |
| 3 | Feeling safe walking alone at night | 0.065794 |
| 2 | Employment rate | 0.131086 |
| 0 | GDP per capita | 0.677460 |
# Plot
importance.hvplot.barh(
x="Feature", y="Importance", title="Does Money Make You Happier?"
)
For more information, see the scikit-learn docs
The cross_val_score() function will automatically partition the training set into k folds, fit the model to the subset, and return the scores for each partition.
It takes a Pipeline object, the training features, and the training labels as arguments
from sklearn.model_selection import cross_val_score
cross_val_score?
# Make a linear pipeline
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())
# Run the 3-fold cross validation
scores = cross_val_score(
linear_pipe,
X_train,
y_train,
cv=3,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [ 0.02064625 -0.84773581 -0.53652985] Scores mean = -0.45453980429946145 Score std dev = 0.35922474493059114
# Make a random forest pipeline
forest_pipe = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 3-fold cross validation
scores = cross_val_score(
forest_pipe,
X_train,
y_train,
cv=3,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [0.51950656 0.78033403 0.66526384] Scores mean = 0.655034812029183 Score std dev = 0.10672774411781198
n_estimators is a model hyperparameter(via the fit() method)This is when cross validation becomes very important
from sklearn.model_selection import GridSearchCV
Let's do a search over the n_estimators parameter and the max_depth parameter:
# Create our regression pipeline
pipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
pipe
Pipeline(steps=[('standardscaler', StandardScaler()),
('randomforestregressor',
RandomForestRegressor(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('standardscaler', StandardScaler()),
('randomforestregressor',
RandomForestRegressor(random_state=42))])StandardScaler()
RandomForestRegressor(random_state=42)
"[step name]__[parameter name]"
model_step = "randomforestregressor"
param_grid = {
f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}
param_grid
{'randomforestregressor__n_estimators': [5, 10, 15, 20, 30, 50, 100, 200],
'randomforestregressor__max_depth': [2, 5, 7, 9, 13, 21, 33, 51]}
# Create the grid and use 3-fold CV
grid = GridSearchCV(pipe, param_grid, cv=3, verbose=1)
# Run the search
grid.fit(X_train, y_train)
Fitting 3 folds for each of 64 candidates, totalling 192 fits
GridSearchCV(cv=3,
estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
('randomforestregressor',
RandomForestRegressor(random_state=42))]),
param_grid={'randomforestregressor__max_depth': [2, 5, 7, 9, 13,
21, 33, 51],
'randomforestregressor__n_estimators': [5, 10, 15, 20,
30, 50, 100,
200]},
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=3,
estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
('randomforestregressor',
RandomForestRegressor(random_state=42))]),
param_grid={'randomforestregressor__max_depth': [2, 5, 7, 9, 13,
21, 33, 51],
'randomforestregressor__n_estimators': [5, 10, 15, 20,
30, 50, 100,
200]},
verbose=1)Pipeline(steps=[('standardscaler', StandardScaler()),
('randomforestregressor',
RandomForestRegressor(random_state=42))])StandardScaler()
RandomForestRegressor(random_state=42)
# The best estimator
grid.best_estimator_
Pipeline(steps=[('standardscaler', StandardScaler()),
('randomforestregressor',
RandomForestRegressor(max_depth=7, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('standardscaler', StandardScaler()),
('randomforestregressor',
RandomForestRegressor(max_depth=7, random_state=42))])StandardScaler()
RandomForestRegressor(max_depth=7, random_state=42)
# The best hyper parameters
grid.best_params_
{'randomforestregressor__max_depth': 7,
'randomforestregressor__n_estimators': 100}
We'll define a helper utility function to calculate the accuracy in terms of the mean absolute percent error
def evaluate_mape(model, X_test, y_test):
"""
Given a model and test features/targets, print out the
mean absolute error and accuracy
"""
# Make the predictions
predictions = model.predict(X_test)
# Absolute error
errors = abs(predictions - y_test)
avg_error = np.mean(errors)
# Mean absolute percentage error
mape = 100 * np.mean(errors / y_test)
# Accuracy
accuracy = 100 - mape
print("Model Performance")
print(f"Average Absolute Error: {avg_error:0.4f}")
print(f"Accuracy = {accuracy:0.2f}%.")
return accuracy
# Setup the pipeline
linear = make_pipeline(StandardScaler(), LinearRegression())
# Fit on train set
linear.fit(X_train, y_train)
# Evaluate on test set
evaluate_mape(linear, X_test, y_test)
Model Performance Average Absolute Error: 0.3281 Accuracy = 94.93%.
94.92864894582036
# Initialize the pipeline
base_model = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
# Fit the training set
base_model.fit(X_train, y_train)
# Evaluate on the test set
base_accuracy = evaluate_mape(base_model, X_test, y_test)
Model Performance Average Absolute Error: 0.2291 Accuracy = 96.47%.
Small improvement!
# Evaluate the best random forest model
best_random = grid.best_estimator_
random_accuracy = evaluate_mape(best_random, X_test, y_test)
# What's the improvement?
improvement = 100 * (random_accuracy - base_accuracy) / base_accuracy
print(f'Improvement of {improvement:0.4f}%.')
Model Performance Average Absolute Error: 0.2288 Accuracy = 96.47%. Improvement of 0.0025%.
cross_val_scoreGridSearchCVIn this part, we'll use a random forest model and housing data from the Office of Property Assessment to predict residential sale prices in Philadelphia
Note: We'll focus on the first two components in this analysis (and in assignment #7)
Too often, these models perpetuate inequality: low-value homes are over-assessed and high-value homes are under-assessed
Let's download data for properties in Philadelphia that had their last sale during 2021 (the last full calendar year)
Sources:
import carto2gpd
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2021 sales for single family residential properties
where = "sale_date >= '2021-01-01' and sale_date <= '2021-12-31'"
where = where + " and category_code_description IN ('SINGLE FAMILY', 'Single Family')"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
salesRaw.head()
| geometry | cartodb_id | assessment_date | basements | beginning_point | book_and_page | building_code | building_code_description | category_code | category_code_description | census_tract | central_air | cross_reference | date_exterior_condition | depth | exempt_building | exempt_land | exterior_condition | fireplaces | frontage | fuel | garage_spaces | garage_type | general_construction | geographic_ward | homestead_exemption | house_extension | house_number | interior_condition | location | mailing_address_1 | mailing_address_2 | mailing_care_of | mailing_city_state | mailing_street | mailing_zip | market_value | market_value_date | number_of_bathrooms | number_of_bedrooms | number_of_rooms | number_stories | off_street_open | other_building | owner_1 | owner_2 | parcel_number | parcel_shape | quality_grade | recording_date | registry_number | sale_date | sale_price | separate_utilities | sewer | site_type | state_code | street_code | street_designation | street_direction | street_name | suffix | taxable_building | taxable_land | topography | total_area | total_livable_area | type_heater | unfinished | unit | utility | view_type | year_built | year_built_estimate | zip_code | zoning | pin | building_code_new | building_code_description_new | objectid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | POINT (-75.19667 39.93733) | 982 | 2021-07-16T00:00:00Z | None | 164' S WHARTON ST | 54096200 | O30 | ROW 2 STY MASONRY | 1 | SINGLE FAMILY | 33 | None | None | None | 62.0 | 0 | 0 | 4 | 0.0 | 16.0 | None | 0.0 | None | A | 36 | 0.0 | None | 1319 | 4 | 1319 S 32ND ST | None | None | None | PHILADELPHIA PA | 1319 S 32ND ST | 19146-3409 | 134100 | None | 1.0 | 3.0 | NaN | 2.0 | 761.0 | None | HAMMOND FREDERICK HORACE | HAMMOND TERRY R | 362299500 | E | C | 2022-09-12T00:00:00Z | 010S080153 | 2021-12-23T00:00:00Z | 1.0 | None | None | None | PA | 88430 | ST | S | 32ND | None | 107280 | 26820 | F | 992.0 | 1350.0 | None | None | None | None | I | 1925 | Y | 19146 | RSA5 | 1001649631 | 22 | ROW TYPICAL | 221102782 |
| 1 | POINT (-75.16623 39.94166) | 1012 | 2022-03-09T00:00:00Z | A | SWC BROAD & FITZWATER STS | 53950012 | SC | VACANT LAND COMMER < ACRE | 1 | SINGLE FAMILY | 14 | Y | None | None | 48.0 | 1200800 | 0 | 1 | NaN | 18.0 | None | 2.0 | 1 | B | 30 | 0.0 | None | 742 | 1 | 742 S BROAD ST | UNIT#9 | None | None | PHILADELPHIA PA | 742 S BROAD STREET | 19146 | 1501000 | None | 3.0 | 4.0 | NaN | 3.0 | NaN | None | MARSHALL ARIELA LUCY | KAPA SURAJ | 301262704 | E | None | 2022-11-19T00:00:00Z | 006S010392 | 2021-11-10T00:00:00Z | 1695000.0 | None | None | None | PA | 19160 | ST | S | BROAD | None | 0 | 300200 | F | 864.0 | 3529.0 | None | None | None | None | I | 2021 | None | 19146 | CMX3 | 1001681202 | 25 | ROW MODERN | 221102739 |
| 2 | POINT (-75.02846 40.11039) | 1086 | 2021-07-16T00:00:00Z | F | 89'11 1/2" NE SEL459 | 54095903 | K30 | S/D W/B GAR 2 STY MASONRY | 1 | SINGLE FAMILY | 357 | None | None | None | 97.0 | 0 | 0 | 4 | 0.0 | 25.0 | None | 1.0 | None | A | 58 | 0.0 | None | 10229 | 4 | 10229 SELMER PLZ | None | None | None | PHILADELPHIA PA | 10229 SELMER PLZ | 19116-3629 | 258900 | None | 0.0 | 0.0 | NaN | 2.0 | 6289.0 | None | ALHAJ SAID | None | 582460600 | E | C | 2022-09-09T00:00:00Z | 153N210213 | 2021-10-07T00:00:00Z | 1.0 | None | None | None | PA | 71672 | PLZ | None | SELMER | None | 207120 | 51780 | F | 2377.0 | 1354.0 | None | None | None | None | I | 1968 | Y | 19116 | RSA3 | 1001476645 | 27 | TWIN POST WAR | 221102881 |
| 3 | POINT (-75.14689 40.05641) | 1207 | 2021-07-16T00:00:00Z | H | 152'6" N 67TH AVE | 54095296 | R30 | ROW B/GAR 2 STY MASONRY | 1 | SINGLE FAMILY | 267 | N | None | None | 76.0 | 80000 | 0 | 4 | 0.0 | 16.0 | None | 1.0 | None | A | 10 | 80000.0 | None | 6720 | 4 | 6720 N BOUVIER ST | None | None | None | PHILADELPHIA PA | 6720 N BOUVIER ST | 19126-2610 | 151200 | None | 1.0 | 3.0 | NaN | 2.0 | 2218.0 | None | BETHAY ANTOINETTE | None | 101076300 | E | C+ | 2022-09-08T00:00:00Z | 138N210282 | 2021-01-16T00:00:00Z | 1.0 | None | None | None | PA | 18500 | ST | N | BOUVIER | None | 40960 | 30240 | F | 1216.0 | 1260.0 | H | None | None | None | I | 1925 | Y | 19126 | RSA5 | 1001097499 | 24 | ROW PORCH FRONT | 221103000 |
| 4 | POINT (-75.12769 39.99808) | 1299 | 2021-07-16T00:00:00Z | D | 429'4" E FRONT ST | 54094828 | O30 | ROW 2 STY MASONRY | 1 | SINGLE FAMILY | 176 | N | None | None | 65.0 | 0 | 0 | 4 | 0.0 | 15.0 | None | 0.0 | None | A | 7 | 0.0 | None | 154 | 4 | 154 E ALLEGHENY AVE | None | None | MAIANNA FLORES | PHILADELPHIA PA | 154 E ALLGHENY AVE | 19134 | 84300 | None | 1.0 | 3.0 | NaN | 2.0 | 765.0 | None | MENTLE MARIANA FLORES | GINEZ BENJAMIN | 071300400 | E | C | 2022-09-07T00:00:00Z | 038N200029 | 2021-10-12T00:00:00Z | 18200.0 | None | None | None | PA | 12040 | AVE | E | ALLEGHENY | None | 67440 | 16860 | F | 1002.0 | 1593.0 | B | None | None | None | I | 1920 | Y | 19134 | RM1 | 1001058475 | 24 | ROW PORCH FRONT | 221103092 |
len(salesRaw)
25680
salesRaw.head()
| geometry | cartodb_id | assessment_date | basements | beginning_point | book_and_page | building_code | building_code_description | category_code | category_code_description | census_tract | central_air | cross_reference | date_exterior_condition | depth | exempt_building | exempt_land | exterior_condition | fireplaces | frontage | fuel | garage_spaces | garage_type | general_construction | geographic_ward | homestead_exemption | house_extension | house_number | interior_condition | location | mailing_address_1 | mailing_address_2 | mailing_care_of | mailing_city_state | mailing_street | mailing_zip | market_value | market_value_date | number_of_bathrooms | number_of_bedrooms | number_of_rooms | number_stories | off_street_open | other_building | owner_1 | owner_2 | parcel_number | parcel_shape | quality_grade | recording_date | registry_number | sale_date | sale_price | separate_utilities | sewer | site_type | state_code | street_code | street_designation | street_direction | street_name | suffix | taxable_building | taxable_land | topography | total_area | total_livable_area | type_heater | unfinished | unit | utility | view_type | year_built | year_built_estimate | zip_code | zoning | pin | building_code_new | building_code_description_new | objectid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | POINT (-75.19667 39.93733) | 982 | 2021-07-16T00:00:00Z | None | 164' S WHARTON ST | 54096200 | O30 | ROW 2 STY MASONRY | 1 | SINGLE FAMILY | 33 | None | None | None | 62.0 | 0 | 0 | 4 | 0.0 | 16.0 | None | 0.0 | None | A | 36 | 0.0 | None | 1319 | 4 | 1319 S 32ND ST | None | None | None | PHILADELPHIA PA | 1319 S 32ND ST | 19146-3409 | 134100 | None | 1.0 | 3.0 | NaN | 2.0 | 761.0 | None | HAMMOND FREDERICK HORACE | HAMMOND TERRY R | 362299500 | E | C | 2022-09-12T00:00:00Z | 010S080153 | 2021-12-23T00:00:00Z | 1.0 | None | None | None | PA | 88430 | ST | S | 32ND | None | 107280 | 26820 | F | 992.0 | 1350.0 | None | None | None | None | I | 1925 | Y | 19146 | RSA5 | 1001649631 | 22 | ROW TYPICAL | 221102782 |
| 1 | POINT (-75.16623 39.94166) | 1012 | 2022-03-09T00:00:00Z | A | SWC BROAD & FITZWATER STS | 53950012 | SC | VACANT LAND COMMER < ACRE | 1 | SINGLE FAMILY | 14 | Y | None | None | 48.0 | 1200800 | 0 | 1 | NaN | 18.0 | None | 2.0 | 1 | B | 30 | 0.0 | None | 742 | 1 | 742 S BROAD ST | UNIT#9 | None | None | PHILADELPHIA PA | 742 S BROAD STREET | 19146 | 1501000 | None | 3.0 | 4.0 | NaN | 3.0 | NaN | None | MARSHALL ARIELA LUCY | KAPA SURAJ | 301262704 | E | None | 2022-11-19T00:00:00Z | 006S010392 | 2021-11-10T00:00:00Z | 1695000.0 | None | None | None | PA | 19160 | ST | S | BROAD | None | 0 | 300200 | F | 864.0 | 3529.0 | None | None | None | None | I | 2021 | None | 19146 | CMX3 | 1001681202 | 25 | ROW MODERN | 221102739 |
| 2 | POINT (-75.02846 40.11039) | 1086 | 2021-07-16T00:00:00Z | F | 89'11 1/2" NE SEL459 | 54095903 | K30 | S/D W/B GAR 2 STY MASONRY | 1 | SINGLE FAMILY | 357 | None | None | None | 97.0 | 0 | 0 | 4 | 0.0 | 25.0 | None | 1.0 | None | A | 58 | 0.0 | None | 10229 | 4 | 10229 SELMER PLZ | None | None | None | PHILADELPHIA PA | 10229 SELMER PLZ | 19116-3629 | 258900 | None | 0.0 | 0.0 | NaN | 2.0 | 6289.0 | None | ALHAJ SAID | None | 582460600 | E | C | 2022-09-09T00:00:00Z | 153N210213 | 2021-10-07T00:00:00Z | 1.0 | None | None | None | PA | 71672 | PLZ | None | SELMER | None | 207120 | 51780 | F | 2377.0 | 1354.0 | None | None | None | None | I | 1968 | Y | 19116 | RSA3 | 1001476645 | 27 | TWIN POST WAR | 221102881 |
| 3 | POINT (-75.14689 40.05641) | 1207 | 2021-07-16T00:00:00Z | H | 152'6" N 67TH AVE | 54095296 | R30 | ROW B/GAR 2 STY MASONRY | 1 | SINGLE FAMILY | 267 | N | None | None | 76.0 | 80000 | 0 | 4 | 0.0 | 16.0 | None | 1.0 | None | A | 10 | 80000.0 | None | 6720 | 4 | 6720 N BOUVIER ST | None | None | None | PHILADELPHIA PA | 6720 N BOUVIER ST | 19126-2610 | 151200 | None | 1.0 | 3.0 | NaN | 2.0 | 2218.0 | None | BETHAY ANTOINETTE | None | 101076300 | E | C+ | 2022-09-08T00:00:00Z | 138N210282 | 2021-01-16T00:00:00Z | 1.0 | None | None | None | PA | 18500 | ST | N | BOUVIER | None | 40960 | 30240 | F | 1216.0 | 1260.0 | H | None | None | None | I | 1925 | Y | 19126 | RSA5 | 1001097499 | 24 | ROW PORCH FRONT | 221103000 |
| 4 | POINT (-75.12769 39.99808) | 1299 | 2021-07-16T00:00:00Z | D | 429'4" E FRONT ST | 54094828 | O30 | ROW 2 STY MASONRY | 1 | SINGLE FAMILY | 176 | N | None | None | 65.0 | 0 | 0 | 4 | 0.0 | 15.0 | None | 0.0 | None | A | 7 | 0.0 | None | 154 | 4 | 154 E ALLEGHENY AVE | None | None | MAIANNA FLORES | PHILADELPHIA PA | 154 E ALLGHENY AVE | 19134 | 84300 | None | 1.0 | 3.0 | NaN | 2.0 | 765.0 | None | MENTLE MARIANA FLORES | GINEZ BENJAMIN | 071300400 | E | C | 2022-09-07T00:00:00Z | 038N200029 | 2021-10-12T00:00:00Z | 18200.0 | None | None | None | PA | 12040 | AVE | E | ALLEGHENY | None | 67440 | 16860 | F | 1002.0 | 1593.0 | B | None | None | None | I | 1920 | Y | 19134 | RM1 | 1001058475 | 24 | ROW PORCH FRONT | 221103092 |
import missingno as msno
# We'll visualize the first half of columns
# and then the second half
ncol = len(salesRaw.columns)
fields1 = salesRaw.columns[:ncol//2]
fields2 = salesRaw.columns[ncol//2:]
# The first half of columns
msno.bar(salesRaw[fields1]);
# The second half of columns
msno.bar(salesRaw[fields2]);
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()
# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
19311
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# The features
feature_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
# Make a random forest pipeline
forest = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 10-fold cross validation
scores = cross_val_score(
forest,
X_train,
y_train,
cv=10,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores = [0.36771032 0.31161862 0.27770105 0.37840209 0.32451637 0.33635098 0.35035929 0.35185047 0.31090724 0.34235852] Scores mean = 0.33517749465900903 Score std dev = 0.028371528460270915
# Fit on the training data
forest.fit(X_train, y_train)
# What's the test score?
forest.score(X_test, y_test)
0.38305072046176447
Model appears to generalize reasonably well!
Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
# Extract the regressor from the pipeline
regressor = forest["randomforestregressor"]
# Create the data frame of importances
importance = pd.DataFrame(
{
"Feature": feature_cols,
"Importance": regressor.feature_importances_
}
).sort_values(by="Importance")
importance.hvplot.barh(x="Feature", y="Importance")
We can use a technique called one-hot encoding
Steps:
OneHotEncoder object is a preprocessor that will perform the vectorization stepColumnTransformer object will help us apply different transformers to numerical and categorical columnsfrom sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
Let's try out the example data of colors:
# Example data of colors
colors = np.array(["red", "green", "blue", "red"])
colors = colors[:, np.newaxis]
colors.shape
(4, 1)
colors
array([['red'],
['green'],
['blue'],
['red']], dtype='<U5')
# Initialize the OHE transformer
ohe = OneHotEncoder()
# Fit the transformer and then transform the colors
ohe.fit_transform(colors).toarray()
array([[0., 0., 1.],
[0., 1., 0.],
[1., 0., 0.],
[0., 0., 1.]])
# The corresponding category for each column
ohe.categories_
[array(['blue', 'green', 'red'], dtype='<U5')]
Let's apply separate transformers for our numerical and categorical columns:
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
# NEW: Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# NEW: et up the column transformer with two transformers
# ----> Scale the numerical columns
# ----> One-hot encode the categorical columns
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
Note: the handle_unknown='ignore' parameter ensures that if categories show up in our training set, but not our test set, no error will be raised.
Initialize the pipeline object, using the column transformer and the random forest regressor
# Initialize the pipeline
# NOTE: only use 10 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=10,
random_state=42)
)
Now, let's fit the model.
train_set and test_setX_train and X_test numpy arrays.# Fit the training set
pipe.fit(train_set, y_train);
# What's the test score?
pipe.score(test_set, y_test)
0.5900285529835902
$R^2$ of ~0.38 improved to $R^2$ of ~0.59!
Takeaway: neighborhood based effects play a crucial role in determining housing prices.
Side Note: to fully validate the model we should run $k$-fold cross validation and optimize hyperparameters of the model as well...
This will be part of assignment #7
But first, we need to know the column names! The one-hot encoder created a column for each category type...
# The column transformer...
transformer
ColumnTransformer(transformers=[('num', StandardScaler(),
['total_livable_area', 'total_area',
'garage_spaces', 'fireplaces',
'number_of_bathrooms', 'number_of_bedrooms',
'number_stories']),
('cat', OneHotEncoder(handle_unknown='ignore'),
['exterior_condition', 'zip_code'])])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. ColumnTransformer(transformers=[('num', StandardScaler(),
['total_livable_area', 'total_area',
'garage_spaces', 'fireplaces',
'number_of_bathrooms', 'number_of_bedrooms',
'number_stories']),
('cat', OneHotEncoder(handle_unknown='ignore'),
['exterior_condition', 'zip_code'])])['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories']
StandardScaler()
['exterior_condition', 'zip_code']
OneHotEncoder(handle_unknown='ignore')
# The steps in the column transformer
transformer.named_transformers_
{'num': StandardScaler(),
'cat': OneHotEncoder(handle_unknown='ignore'),
'remainder': 'drop'}
# The one-hot step
ohe = transformer.named_transformers_['cat']
ohe
OneHotEncoder(handle_unknown='ignore')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
OneHotEncoder(handle_unknown='ignore')
# One column for each category type!
ohe_cols = ohe.get_feature_names_out()
ohe_cols
array(['exterior_condition_0', 'exterior_condition_1',
'exterior_condition_2', 'exterior_condition_3',
'exterior_condition_4', 'exterior_condition_5',
'exterior_condition_6', 'exterior_condition_7', 'zip_code_19102',
'zip_code_19103', 'zip_code_19104', 'zip_code_19106',
'zip_code_19107', 'zip_code_19111', 'zip_code_19114',
'zip_code_19115', 'zip_code_19116', 'zip_code_19118',
'zip_code_19119', 'zip_code_19120', 'zip_code_19121',
'zip_code_19122', 'zip_code_19123', 'zip_code_19124',
'zip_code_19125', 'zip_code_19126', 'zip_code_19127',
'zip_code_19128', 'zip_code_19129', 'zip_code_19130',
'zip_code_19131', 'zip_code_19132', 'zip_code_19133',
'zip_code_19134', 'zip_code_19135', 'zip_code_19136',
'zip_code_19137', 'zip_code_19138', 'zip_code_19139',
'zip_code_19140', 'zip_code_19141', 'zip_code_19142',
'zip_code_19143', 'zip_code_19144', 'zip_code_19145',
'zip_code_19146', 'zip_code_19147', 'zip_code_19148',
'zip_code_19149', 'zip_code_19150', 'zip_code_19151',
'zip_code_19152', 'zip_code_19153', 'zip_code_19154'], dtype=object)
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
features
['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories', 'exterior_condition_0', 'exterior_condition_1', 'exterior_condition_2', 'exterior_condition_3', 'exterior_condition_4', 'exterior_condition_5', 'exterior_condition_6', 'exterior_condition_7', 'zip_code_19102', 'zip_code_19103', 'zip_code_19104', 'zip_code_19106', 'zip_code_19107', 'zip_code_19111', 'zip_code_19114', 'zip_code_19115', 'zip_code_19116', 'zip_code_19118', 'zip_code_19119', 'zip_code_19120', 'zip_code_19121', 'zip_code_19122', 'zip_code_19123', 'zip_code_19124', 'zip_code_19125', 'zip_code_19126', 'zip_code_19127', 'zip_code_19128', 'zip_code_19129', 'zip_code_19130', 'zip_code_19131', 'zip_code_19132', 'zip_code_19133', 'zip_code_19134', 'zip_code_19135', 'zip_code_19136', 'zip_code_19137', 'zip_code_19138', 'zip_code_19139', 'zip_code_19140', 'zip_code_19141', 'zip_code_19142', 'zip_code_19143', 'zip_code_19144', 'zip_code_19145', 'zip_code_19146', 'zip_code_19147', 'zip_code_19148', 'zip_code_19149', 'zip_code_19150', 'zip_code_19151', 'zip_code_19152', 'zip_code_19153', 'zip_code_19154']
random_forest = pipe["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": random_forest.feature_importances_}
).sort_values(by="Importance", ascending=False)
importance.head(n=20)
| Feature | Importance | |
|---|---|---|
| 0 | total_livable_area | 0.219391 |
| 1 | total_area | 0.182213 |
| 4 | number_of_bathrooms | 0.116660 |
| 46 | zip_code_19140 | 0.059798 |
| 38 | zip_code_19132 | 0.049526 |
| 40 | zip_code_19134 | 0.047761 |
| 39 | zip_code_19133 | 0.037957 |
| 5 | number_of_bedrooms | 0.032194 |
| 48 | zip_code_19142 | 0.020377 |
| 6 | number_stories | 0.017484 |
| 14 | exterior_condition_7 | 0.014137 |
| 2 | garage_spaces | 0.011424 |
| 30 | zip_code_19124 | 0.011310 |
| 37 | zip_code_19131 | 0.010841 |
| 10 | exterior_condition_3 | 0.010744 |
| 12 | exterior_condition_5 | 0.010726 |
| 11 | exterior_condition_4 | 0.010458 |
| 45 | zip_code_19139 | 0.009919 |
| 53 | zip_code_19147 | 0.008496 |
| 49 | zip_code_19143 | 0.008411 |
# Sort by importance and get the top 30
# SORT IN DESCENDING ORDER
importance = importance.sort_values("Importance", ascending=False).iloc[:30]
# Plot
importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)